What are common characteristics of employees lost in attrition compared to those who stay in IBM’s fictional dataset?

Estimated reading time: 30 minutes

What are common characteristics of employees lost in attrition compared to those who stay in IBM’s fictional dataset?

We will be using point plots, box plots, kernel density diagrams, means, standard deviations, and z-tests to explore this question.

Set Up Dataset

from pandas import read_csv
data = read_csv("data/attrition.csv")

# Column holding the attrition label (Yes / No in the preview rows).
target = "Attrition"

# Bucket every non-target column name under the string form of its dtype,
# preserving the column order of the DataFrame inside each bucket.
feature_by_dtype = {}
for c in data.columns:
    if c != target:
        data_type = str(data[c].dtype)
        feature_by_dtype.setdefault(data_type, []).append(c)

feature_by_dtype
feature_by_dtype.keys()

dict_keys(['int64', 'object'])

# Names of the columns whose dtype string is "object".
objects = feature_by_dtype["object"]

# Columns to leave out when the feature lists are assembled.
# NOTE(review): Over18 reads "Y" in every previewed row — presumably constant; verify.
remove = ["Over18"]

import pandas as pd

# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

data.head()

	Age	Attrition	BusinessTravel	DailyRate	Department	DistanceFromHome	Education	EducationField	EmployeeCount	EmployeeNumber	EnvironmentSatisfaction	Gender	HourlyRate	JobInvolvement	JobLevel	JobRole	JobSatisfaction	MaritalStatus	MonthlyIncome	MonthlyRate	NumCompaniesWorked	Over18	OverTime	PercentSalaryHike	PerformanceRating	RelationshipSatisfaction	StandardHours	StockOptionLevel	TotalWorkingYears	TrainingTimesLastYear	WorkLifeBalance	YearsAtCompany	YearsInCurrentRole	YearsSinceLastPromotion	YearsWithCurrManager
0	41	Yes	Travel_Rarely	1102	Sales	1	2	Life Sciences	1	1	2	Female	94	3	2	Sales Executive	4	Single	5993	19479	8	Y	Yes	11	3	1	80	0	8	0	1	6	4	0	5
1	49	No	Travel_Frequently	279	Research & Development	8	1	Life Sciences	1	2	3	Male	61	2	2	Research Scientist	2	Married	5130	24907	1	Y	No	23	4	4	80	1	10	3	3	10	7	1	7
2	37	Yes	Travel_Rarely	1373	Research & Development	2	2	Other	1	4	4	Male	92	2	1	Laboratory Technician	3	Single	2090	2396	6	Y	Yes	15	3	2	80	0	7	3	3	0	0	0	0
3	33	No	Travel_Frequently	1392	Research & Development	3	4	Life Sciences	1	5	4	Female	56	3	1	Research Scientist	3	Married	2909	23159	1	Y	Yes	11	3	3	80	0	8	3	3	8	7	3	0
4	27	No	Travel_Rarely	591	Research & Development	2	1	Medical	1	7	1	Male	40	3	1	Laboratory Technician	2	Married	3468	16632	9	Y	No	12	3	4	80	1	6	3	3	2	2	2	2

# Categorical features: every object-dtype column that is not excluded.
categorical_features = [name for name in objects if name not in remove]

# Integer-typed columns; split below into count-like and numerical features.
int64s = feature_by_dtype["int64"]

# Exclude two more columns before the integer feature lists are derived.
remove.extend(["StandardHours", "EmployeeCount"])

# Count-like features: non-excluded integer columns with fewer than 20
# distinct values.
count_features = [
    name
    for name in int64s
    if name not in remove and len(data[name].unique()) < 20
]

# EmployeeNumber joins the exclusion list only after count_features is built.
remove.append("EmployeeNumber")

# Numerical features: every integer column that survived the exclusions.
numerical_features = [name for name in int64s if name not in remove]

Numerical Features

data[numerical_features].head()

	Age	DailyRate	DistanceFromHome	Education	EnvironmentSatisfaction	HourlyRate	JobInvolvement	JobLevel	JobSatisfaction	MonthlyIncome	MonthlyRate	NumCompaniesWorked	PercentSalaryHike	PerformanceRating	RelationshipSatisfaction	StockOptionLevel	TotalWorkingYears	TrainingTimesLastYear	WorkLifeBalance	YearsAtCompany	YearsInCurrentRole	YearsSinceLastPromotion	YearsWithCurrManager
0	41	1102	1	2	2	94	3	2	4	5993	19479	8	11	3	1	0	8	0	1	6	4	0	5
1	49	279	8	1	3	61	2	2	2	5130	24907	1	23	4	4	1	10	3	3	10	7	1	7
2	37	1373	2	2	4	92	2	1	3	2090	2396	6	15	3	2	0	7	3	3	0	0	0	0
3	33	1392	3	4	4	56	3	1	3	2909	23159	1	11	3	3	0	8	3	3	8	7	3	0
4	27	591	2	1	1	40	3	1	2	3468	16632	9	12	3	4	1	6	3	3	2	2	2	2

Python Source Code

def display_ttest(data, category, numeric):
    """Display an independent two-sample t-test of `numeric` between two groups.

    The two groups are the rows matching the first and the second value
    returned by `data[category].unique()`. The test statistic and p-value,
    rounded to two decimals, are rendered as a one-row table whose index
    label is `category`. Nothing is returned.
    """
    from IPython.display import display
    from pandas import DataFrame
    from scipy.stats import ttest_ind

    levels = data[category].unique()
    first_group = data.loc[data[category] == levels[0], numeric]
    second_group = data.loc[data[category] == levels[1], numeric]

    statistic, p_value = ttest_ind(first_group, second_group)

    headers = ["t-test statistic", "p-value"]
    row = dict(zip(headers, (statistic, p_value)))
    display(DataFrame(data=[row], columns=headers, index=[category]).round(2))

def display_ztest(data, category, numeric):
    """Display a two-sample z-test of `numeric` between two groups.

    The two groups are the rows matching the first and the second value
    returned by `data[category].unique()`. The test statistic and p-value,
    rounded to two decimals, are rendered as a one-row table whose index
    label is `category`. Nothing is returned.
    """
    from IPython.display import display
    from pandas import DataFrame
    from statsmodels.stats.weightstats import ztest

    levels = data[category].unique()
    first_group = data.loc[data[category] == levels[0], numeric]
    second_group = data.loc[data[category] == levels[1], numeric]

    statistic, p_value = ztest(first_group, second_group)

    headers = ["z-test statistic", "p-value"]
    row = dict(zip(headers, (statistic, p_value)))
    display(DataFrame(data=[row], columns=headers, index=[category]).round(2))
    
def display_cxn_analysis(data, category, numeric, target):
    """Plot and summarise how `numeric` differs between the groups of `category`.

    One figure with three panels is drawn:

    * left: point plot of `numeric` per `category` value;
    * top right: horizontal box plot of `numeric` per `category` value;
    * bottom right: overlaid count plots when `numeric` is listed in the
      module-level `count_features`, otherwise overlaid kernel density
      estimates sharing the box plot's x axis.

    After the figure, a z-test table is displayed when the two largest
    groups each hold more than 30 rows (a t-test table otherwise), and the
    mean, standard deviation and absolute mean difference of the first two
    `category` values are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the `category` and `numeric` columns.
    category : str
        Name of the grouping column. The summary compares the first two
        values returned by `data[category].unique()`.
    numeric : str
        Name of the column being analysed.
    target : object
        Value of `category` drawn in yellow; the first other value found is
        drawn in dark grey.

    Raises
    ------
    IndexError
        If `category` holds no value other than `target`.
    """
    from seaborn import boxplot, countplot, kdeplot, pointplot, set_style
    from matplotlib.pyplot import (
        figure,
        rc_context,
        show,
        subplot,
        suptitle,
        xlabel,
        ylabel,
    )

    levels = data[category].unique()
    not_target = [level for level in levels if level != target][0]

    pal = {target: "yellow", not_target: "darkgrey"}

    # Per-group values, named after their group.
    target_values = data[data[category] == target][numeric].rename(target)
    other_values = data[data[category] == not_target][numeric].rename(not_target)

    set_style("whitegrid")
    figure(figsize=(12, 5))
    suptitle(numeric + " by " + category)

    # ---- top right: box plot per group ----
    p1 = subplot(2, 2, 2)
    boxplot(y=category, x=numeric, data=data, orient="h", palette=pal)
    p1.get_xaxis().set_visible(False)

    # ---- bottom right: counts or densities per group ----
    if numeric in count_features:
        subplot(2, 2, 4)

        # Pass the values explicitly as `x`; the first positional parameter
        # of countplot is not `x` in every seaborn release.
        countplot(x=other_values, color=pal[not_target])
        ax = countplot(x=target_values, color=pal[target])

        # Relabel the count axis as a share of all rows in `data`.
        ax.set_yticklabels(
            ["{:.0f}%".format((tick / len(data)) * 100) for tick in ax.get_yticks()]
        )
        ax.set_ylabel("Percentage")
        ax.set_xlabel(numeric)
    else:
        subplot(2, 2, 4, sharex=p1)
        kdeplot(target_values, shade=True, color=pal[target])
        kdeplot(other_values, shade=True, color=pal[not_target])
        ylabel("Distribution Plot")
        xlabel(numeric)

    # ---- left: point plot of the group means ----
    subplot(1, 2, 1)
    with rc_context({'lines.linewidth': 0.8}):
        pointplot(x=category, y=numeric, data=data, capsize=.1, color="black", marker="s")

    show()

    # value_counts() is label-indexed and sorted by frequency, so the two
    # largest groups must be read by position, not with integer keys.
    group_sizes = data[category].value_counts()
    if group_sizes.iloc[0] > 30 and group_sizes.iloc[1] > 30:
        display_ztest(data, category, numeric)
    else:
        display_ttest(data, category, numeric)

    # Means and standard deviations as Series indexed by group, so each
    # lookup below yields a plain scalar for the "%.2f" formatting.
    grouped = data.groupby(category)[numeric]
    means = grouped.mean()
    stds = grouped.std()

    first, second = levels[0], levels[1]
    s1_mean, s1_std = means.loc[first], stds.loc[first]
    s2_mean, s2_std = means.loc[second], stds.loc[second]

    print("%s Mean: %.2f (+/- %.2f)" % (category + " == " + str(first), s1_mean, s1_std))
    print("%s Mean : %.2f (+/- %.2f)" % (category + " == " + str(second), s2_mean, s2_std))
    print("Absolute Mean Diferrence Distance: %.2f" % abs(s1_mean - s2_mean))

def get_p_value(s1, s2):
    """Return the p-value for a difference in means between two samples.

    A z-test (statsmodels `ztest`) is used when both samples hold more than
    30 observations; otherwise an independent two-sample t-test (scipy
    `ttest_ind`) is used. Both are called with their default options.

    Parameters
    ----------
    s1, s2 : sequence of numbers
        The two samples to compare; only `len()` and the test functions are
        applied to them.

    Returns
    -------
    float
        The p-value reported by the selected test.
    """
    # Logical `and`, not bitwise `&`: `&` binds tighter than `>`, so
    # `len(s1) > 30 & len(s2) > 30` is the chained comparison
    # `len(s1) > (30 & len(s2)) > 30`, which can never be true.
    if len(s1) > 30 and len(s2) > 30:
        # Imported only where needed so the t-test path does not require
        # statsmodels.
        from statsmodels.stats.weightstats import ztest

        _, p = ztest(s1, s2)
    else:
        from scipy.stats import ttest_ind

        _, p = ttest_ind(s1, s2)
    return p
    
def get_p_values(data, category, numerics):
    """Return a one-column ("p-value") table with one row per name in `numerics`.

    For each column name, the rows of `data` are split by the first and the
    second value returned by `data[category].unique()`, and the two resulting
    samples are handed to `get_p_value`.
    """
    from pandas import DataFrame

    def _split(column):
        # The two samples of `column`, one per leading category value.
        labels = data[category].unique()
        values = data[column]
        return values[data[category] == labels[0]], values[data[category] == labels[1]]

    p_by_feature = {
        column: {"p-value": get_p_value(*_split(column))} for column in numerics
    }

    # Features arrive as columns; transpose so each feature becomes a row.
    return DataFrame(data=p_by_feature).T

def get_statistically_significant_numerics(data, category, numerics):
    """Return the names in `numerics` whose p-value is strictly below 0.05."""
    p_values = get_p_values(data, category, numerics)["p-value"]
    below_threshold = p_values < 0.05
    return list(p_values[below_threshold].index)

def get_statistically_non_significant_numerics(data, category, numerics):
    """Return the names in `numerics` whose p-value is 0.05 or greater."""
    p_values = get_p_values(data, category, numerics)["p-value"]
    at_or_above_threshold = p_values >= 0.05
    return list(p_values[at_or_above_threshold].index)
    
def display_p_values(data, category, numerics):
    """Display the p-value table rounded to two decimals, largest p-value first."""
    from IPython.display import display

    table = get_p_values(data, category, numerics)
    rounded = table.round(2)
    display(rounded.sort_values("p-value", ascending=False))

### TESTING

# Inline cross-check: for every numerical feature, test whether its mean
# differs between the two attrition classes (Yes and No), and keep the
# p-values in `df` and the ones below 0.05 in `df_sig`.
from scipy.stats import ttest_ind
from statsmodels.stats.weightstats import ztest

output = {}

for numeric in numerical_features:
    s1 = data[data[target] == data[target].unique()[0]][numeric]
    s2 = data[data[target] == data[target].unique()[1]][numeric]

    # Logical `and`, not bitwise `&`: `len(s1) > 30 & len(s2) > 30` parses as
    # `len(s1) > (30 & len(s2)) > 30`, which is never true, so the z-test
    # branch could not be reached.
    if len(s1) > 30 and len(s2) > 30:
        # The original author notes this branch is always taken for this
        # task; the z-test assumes independent samples.
        z, p = ztest(s1, s2)
    else:
        t, p = ttest_ind(s1, s2)

    row = {"p-value": p}
    output[numeric] = row

df = pd.DataFrame(data=output).T
df_sig = df[df["p-value"] < 0.05]

### TEST PASSED

# Split the numerical features by p-value against the target column:
# below 0.05 goes to `significant`, 0.05 or above goes to `ns`.
significant = get_statistically_significant_numerics(data,target,numerical_features) 
ns = get_statistically_non_significant_numerics(data,target,numerical_features)

Statistically Significant Numerical Features

i = iter(significant)

The fictional company on average loses staff that are 3 - 4 years younger than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-6.18	0.0

Attrition == Yes Mean: 33.61 (+/- 9.69)
Attrition == No Mean : 37.56 (+/- 37.56)
Absolute Mean Diferrence Distance: 3.95

Employees lost in attrition tend to have lower daily rates than those who stay.

The kernel density curves of the two groups appear flipped relative to each other.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-2.17	0.03

Attrition == Yes Mean: 750.36 (+/- 401.90)
Attrition == No Mean : 812.50 (+/- 812.50)
Absolute Mean Diferrence Distance: 62.14

Employees lost in attrition tend to have longer commute distances than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	2.99	0.0

Attrition == Yes Mean: 10.63 (+/- 8.45)
Attrition == No Mean : 8.92 (+/- 8.92)
Absolute Mean Diferrence Distance: 1.72

Employees lost in attrition are less satisfied with their work environment on average than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-3.98	0.0

Attrition == Yes Mean: 2.46 (+/- 1.17)
Attrition == No Mean : 2.77 (+/- 2.77)
Absolute Mean Diferrence Distance: 0.31

Employees lost in attrition are less involved with their jobs on average than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-5.02	0.0

Attrition == Yes Mean: 2.52 (+/- 0.77)
Attrition == No Mean : 2.77 (+/- 2.77)
Absolute Mean Diferrence Distance: 0.25

Employees lost in attrition tend to be lower in job level than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-6.57	0.0

Attrition == Yes Mean: 1.64 (+/- 0.94)
Attrition == No Mean : 2.15 (+/- 2.15)
Absolute Mean Diferrence Distance: 0.51

Employees who stay have more job satisfaction than employees lost in attrition.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-3.99	0.0

Attrition == Yes Mean: 2.47 (+/- 1.12)
Attrition == No Mean : 2.78 (+/- 2.78)
Absolute Mean Diferrence Distance: 0.31

Employees lost in attrition tend to have lower monthly income on average than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-6.2	0.0

Attrition == Yes Mean: 4787.09 (+/- 3640.21)
Attrition == No Mean : 6832.74 (+/- 6832.74)
Absolute Mean Diferrence Distance: 2045.65

Employees who stay tend to have more stock options than those lost in attrition.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-5.3	0.0

Attrition == Yes Mean: 0.53 (+/- 0.86)
Attrition == No Mean : 0.85 (+/- 0.85)
Absolute Mean Diferrence Distance: 0.32

Employees lost in attrition had fewer total working years than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-6.65	0.0

Attrition == Yes Mean: 8.24 (+/- 7.17)
Attrition == No Mean : 11.86 (+/- 11.86)
Absolute Mean Diferrence Distance: 3.62

Employees lost in attrition had fewer training opportunities than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-2.28	0.02

Attrition == Yes Mean: 2.62 (+/- 1.25)
Attrition == No Mean : 2.83 (+/- 2.83)
Absolute Mean Diferrence Distance: 0.21

Employees lost in attrition had poorer work-life balance on average than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-2.45	0.01

Attrition == Yes Mean: 2.66 (+/- 0.82)
Attrition == No Mean : 2.78 (+/- 2.78)
Absolute Mean Diferrence Distance: 0.12

Employees who stay had longer organization tenure than those lost in attrition by 2 years on average.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-5.2	0.0

Attrition == Yes Mean: 5.13 (+/- 5.95)
Attrition == No Mean : 7.37 (+/- 7.37)
Absolute Mean Diferrence Distance: 2.24

Employees who stayed had 1 - 2 more years in their current role than those lost in attrition.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-6.23	0.0

Attrition == Yes Mean: 2.90 (+/- 3.17)
Attrition == No Mean : 4.48 (+/- 4.48)
Absolute Mean Diferrence Distance: 1.58

Employees lost in attrition had less time with their current manager by 1 - 2 years on average than those who stay.

display_cxn_analysis(data, target, next(i), "Yes")

png

	z-test statistic	p-value
Attrition	-6.06	0.0

Attrition == Yes Mean: 2.85 (+/- 3.14)
Attrition == No Mean : 4.37 (+/- 4.37)
Absolute Mean Diferrence Distance: 1.52

Employees who stay are more satisfied with their work environment on average than those who leave.

Non-Significant Features

ns

['Education',
 'HourlyRate',
 'MonthlyRate',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'YearsSinceLastPromotion']

### Some Additional Visualisations

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

dataset = data

# Lay out a grid of plots, 5 rows by 3 columns, with independent axes.
f, axes = plt.subplots(5, 3, figsize=(24, 36), sharex=False, sharey=False)

# Each seaborn plot needs the dataset, the "x" and "y" columns and the grid
# cell (ax) to draw into. A third variable can be shown through "hue".

# ---- row 0 ----
sns.swarmplot(x="EducationField", y="MonthlyIncome", data=dataset, hue="Gender", ax=axes[0, 0])
axes[0, 0].set(title="Monthly income against Educational Field")

sns.pointplot(x="PerformanceRating", y="JobSatisfaction", data=dataset, hue="Gender", ax=axes[0, 1])
axes[0, 1].set(title="Job satisfaction against Performance Rating")

sns.barplot(x="NumCompaniesWorked", y="PerformanceRating", data=dataset, ax=axes[0, 2])
axes[0, 2].set(title="Number of companies worked against Performance rating")

# ---- row 1 ----
sns.barplot(x="JobSatisfaction", y="EducationField", data=dataset, ax=axes[1, 0])
axes[1, 0].set(title="Educational Field against Job Satisfaction")

sns.barplot(x="YearsWithCurrManager", y="JobSatisfaction", data=dataset, ax=axes[1, 1])
axes[1, 1].set(title="Years with current Manager against Job Satisfaction")

sns.pointplot(x="JobSatisfaction", y="MonthlyRate", data=dataset, ax=axes[1, 2])
axes[1, 2].set(title="Job Satisfaction against Monthly rate")

# ---- row 2 ----
sns.barplot(x="WorkLifeBalance", y="DistanceFromHome", data=dataset, ax=axes[2, 0])
axes[2, 0].set(title="Distance from home against Work life balance")

# No `jitter` argument here: it is not a pointplot option.
sns.pointplot(x="OverTime", y="WorkLifeBalance", hue="Gender", data=dataset, ax=axes[2, 1])
axes[2, 1].set(title="Work life balance against Overtime")

sns.pointplot(x="OverTime", y="RelationshipSatisfaction", hue="Gender", data=dataset, ax=axes[2, 2])
axes[2, 2].set(title="Overtime against Relationship satisfaction")

# ---- row 3 ----
sns.pointplot(x="MaritalStatus", y="YearsInCurrentRole", hue="Gender", data=dataset, ax=axes[3, 0])
axes[3, 0].set(title="Marital Status against Years in current role")

sns.pointplot(x="Age", y="YearsSinceLastPromotion", hue="Gender", data=dataset, ax=axes[3, 1])
axes[3, 1].set(title="Age against Years since last promotion")

sns.pointplot(x="OverTime", y="PerformanceRating", hue="Gender", data=dataset, ax=axes[3, 2])
axes[3, 2].set(title="Performance Rating against Overtime")

# ---- row 4 ----
sns.barplot(x="Gender", y="PerformanceRating", data=dataset, ax=axes[4, 0])
axes[4, 0].set(title="Performance Rating against Gender")

sns.barplot(x="Gender", y="JobSatisfaction", data=dataset, ax=axes[4, 1])
axes[4, 1].set(title="Job satisfaction against Gender")

sns.countplot(x="Attrition", data=dataset, ax=axes[4, 2])
axes[4, 2].set(title="Attrition distribution")

[<matplotlib.text.Text at 0x10b8ab9e8>]

png